packages <- c(
  "sp.scRNAseq",
  "sp.scRNAseqData",
  "sp.scRNAseqTesting",
  "seqTools",
  "printr",
  "ggthemes",
  "tidyverse"
)
purrr::walk(packages, library, character.only = TRUE)
rm(packages)
#' Reorder an x or y axis within facets
#'
#' Reorder a column before plotting with faceting, such that the values are ordered
#' within each facet. This requires two functions: \code{reorder_within} applied to
#' the column, then either \code{scale_x_reordered} or \code{scale_y_reordered} added
#' to the plot.
#' This is implemented as a bit of a hack: it appends ___ and then the facet
#' at the end of each string.
#'
#' @param x Vector to reorder.
#' @param by Vector of the same length, to use for reordering.
#' @param within Vector of the same length that will later be used for faceting
#' @param fun Function to perform within each subset to determine the resulting
#' ordering. By default, mean.
#' @param sep Separator to distinguish the two. You may want to set this manually
#' if ___ can exist within one of your labels.
#' @param ... In \code{reorder_within} arguments passed on to \code{\link{reorder}}.
#' In the scale functions, extra arguments passed on to
#' \code{\link[ggplot2]{scale_x_discrete}} or \code{\link[ggplot2]{scale_y_discrete}}.
#'
#' @source "Ordering categories within ggplot2 Facets" by Tyler Rinker:
#' \url{https://trinkerrstuff.wordpress.com/2016/12/23/ordering-categories-within-ggplot2-facets/}
#'
#' @examples
#'
#' library(tidyr)
#' library(ggplot2)
#'
#' iris_gathered <- gather(iris, metric, value, -Species)
#'
#' # reordering doesn't work within each facet (see Sepal.Width):
#' ggplot(iris_gathered, aes(reorder(Species, value), value)) +
#'   geom_boxplot() +
#'   facet_wrap(~ metric)
#'
#' # reorder_within and scale_x_reordered work.
#' # (Note that you need to set scales = "free_x" in the facet)
#' ggplot(iris_gathered, aes(reorder_within(Species, value, metric), value)) +
#'   geom_boxplot() +
#'   scale_x_reordered() +
#'   facet_wrap(~ metric, scales = "free_x")
#'
#' @export
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  new_x <- paste(x, within, sep = sep)
  stats::reorder(new_x, by, FUN = fun)
}

#' @rdname reorder_within
#' @export
scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}

#' @rdname reorder_within
#' @export
scale_y_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}
#cpm normalization withoit adding 1 that works on both matrices and tibbles with
#genes as rows and cells as columns
norm <- function(counts) {
  if(all(class(counts) == c("tbl_df", "tbl", "data.frame"))) {
    mat <- counts %>% 
      as.data.frame() %>%
      column_to_rownames("gene") %>%
      as.matrix()
    
    t(t(mat) / colSums(mat) * 10^6) %>%
      matrix_to_tibble("gene")
  } else if(class(counts) == "matrix") {
    t(t(counts) / colSums(counts) * 10^6)
  }
}

The synthetic multiplets are based on the mixing of real synthetic singlets. The algorithm for generating the synthetic multiplets works as follows: The sum of counts is calculated for each gene in two randomly selected singlets from each cell type contributing to the multiplet. The fraction of input into the multiplet can be adjusted so that, e.g. cell type A only contributes half of what cell type B contributes to the multiplet. A vector is then generated where each gene name is represented one time for each of the total counts (e.g. if gene “A” has a sum of counts from all singlets from the cell types contributing to the multiplet of 10, “A” will be repreated 10 times in the vector). This vector is then sampled without replacment. It is sampled x times, where x is equal to the sum of all the counts in all singlets contributing to the multiplet divided by the total number of singlets contributing to the multiplet. Gene names are then counted which provides the raw counts for the newly synthesized multiplet.

Here we utilize the sorted cell line dataset to generate multiplets from the singlets. The synthesized multiplets are then compared to the real multiplets to acertain the quality of the in silico synthesis. We begin by synthesizing one multiplet per cell line combination, i.e. a A375-HCT116, A375-HOS, and HCT116-HOS multiplet.

Synthesize one multiplet per combination

s <- str_detect(colnames(countsSorted2), "^s")
sng <- countsSorted2[, s]
classes <- slice(countsSortedMeta2, match(colnames(sng), countsSortedMeta2$sample))$cellTypes
combos <- apply(combn(unique(classes), 2), 2, sort)
adjustment <- rep(1, length(unique(classes)))
names(adjustment) <- unique(classes)

synth <- sng %>%
  norm() %>%
  syntheticMultipletsFromCounts(., classes, combos, adjustment) %>%
  norm()

mul <- countsSorted2[, !s] %>%
  norm() %>%
  matrix_to_tibble("gene") %>%
  gather(sample, counts, -gene) %>%
  left_join(countsSortedMeta2, by = "sample") %>%
  select(gene, counts, cellTypes, sample)

#randomly select some genes to facilitate plotting
genes <- sample(rownames(sng), 500, replace = FALSE)
concatDat <- gather(synth, cellTypes, synCounts, -gene) %>%
  inner_join(., mul, by = c("gene", "cellTypes")) %>%
  filter(gene %in% genes)

Examine drop-out rate in the real data compared to the synthetic data.

dr <- countsSorted2[, !s] %>%
  norm() %>%
  matrix_to_tibble("gene") %>%
  select(filter(countsSortedMeta2, cellTypes %in% colnames(synth))$sample) %>%
  as.matrix() %>%
  apply(., 2, function(x) length(which(x == 0))) %>%
  `/` (nrow(countsSorted2)) %>%
  mean()

print(paste0("real: ", dr))
## [1] "real: 0.485014576743308"
ds <- synth %>%
  select(-gene) %>%
  as.matrix() %>%
  apply(., 2, function(x) length(which(x == 0))) %>%
  `/` (nrow(countsSorted2)) %>%
  mean()

print(paste0("synthetic: ", ds))
## [1] "synthetic: 0.45452685582991"

Results are shown in several different plots below.

#linear plot
concatDat %>%
  ggplot() +
  geom_point(aes(reorder_within(gene, counts, cellTypes, fun = "mean"), counts), size = 0.1) +
  geom_line(
    data = . %>% select(gene, cellTypes, synCounts) %>% distinct(),
    aes(reorder_within(gene, synCounts, cellTypes, fun = "mean"), synCounts, group = cellTypes),
    colour = "red"
  ) +
  facet_wrap(~cellTypes) +
  scale_x_reordered() +
  theme_few() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "500 randomly selected genes", y = "cpm")

#log plot
concatDat %>%
  ggplot() +
  geom_point(aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1) +
  geom_point(
    data = . %>% select(gene, cellTypes, synCounts) %>% distinct(),
    aes(reorder_within(gene, log2(synCounts + 1), cellTypes, fun = "mean"), log2(synCounts + 1)),
    colour = "red", size = 0.3
  ) +
  facet_wrap(~cellTypes, scales = "free") +
  scale_x_reordered() +
  theme_few() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "500 randomly selected genes", y = "log2(cpm)")

#violin plot
concatDat %>%
  filter(gene %in% sample(concatDat$gene, 50, replace = FALSE)) %>%
  ggplot() +
  geom_violin(aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1) +
  geom_point(aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1) +
  geom_point(
    data = . %>% select(gene, cellTypes, synCounts) %>% distinct(),
    aes(reorder_within(gene, log2(synCounts + 1), cellTypes, fun = "mean"), log2(synCounts + 1)),
    colour = "red", size = 0.3
  ) +
  facet_wrap(~cellTypes, scales = "free") +
  scale_x_reordered() +
  theme_few() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "50 randomly selected genes", y = "log2(cpm)")


Synthesize 10 multiplets per combination.

Here we continue to examine the ability of the synthesized multiplets to accuratley reflect the real multiplets by generating 10 multiplets per combination. It is hoped that this will show that the synthesized multiplets as a whole, accuratley capture the diversity of the real multiplets.

n <- 10
names <- paste(
  apply(combos, 2, paste, collapse = "-"), 
  rep(1:n, each = ncol(combos)), 
  sep = "."
)

synth <- sng %>%
  norm() %>%
  {map(1:n, function(x) {
    syntheticMultipletsFromCounts(., classes, combos, adjustment, seed = x + 9789)
  })} %>%
  reduce(full_join, by = "gene") %>%
  setNames(c("gene", names)) %>%
  norm()

genes <- sample(rownames(sng), 500, replace = FALSE)
concatDat <- synth %>% 
  filter(gene %in% genes) %>%
  gather(typeRep, synCounts, -gene) %>%
  separate(typeRep, c("cellTypes", "sample"), sep = "\\.") %>%
  inner_join(., mul, by = c("gene", "cellTypes"))

Examine drop-out rate.

dr <- countsSorted2[, !s] %>%
  norm() %>%
  matrix_to_tibble("gene") %>%
  select(filter(countsSortedMeta2, cellTypes %in% concatDat$cellTypes)$sample) %>%
  as.matrix() %>%
  apply(., 2, function(x) length(which(x == 0))) %>%
  `/` (nrow(countsSorted2)) %>%
  mean()

print(paste0("real: ", dr))
## [1] "real: 0.485014576743308"
ds <- synth %>%
  select(-gene) %>%
  as.matrix() %>%
  apply(., 2, function(x) length(which(x == 0))) %>%
  `/` (nrow(countsSorted2)) %>%
  mean()

print(paste0("synthetic: ", ds))
## [1] "synthetic: 0.491748894449428"

Plot results.

#linear
concatDat %>%
  ggplot() +
  geom_point(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, counts, cellTypes, fun = "mean"), counts), size = 0.1
  ) +
  geom_line(
    data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
    aes(reorder_within(gene, synCounts, cellTypes, fun = "mean"), synCounts, group = cellTypes, colour = sample.x)
  ) +
  facet_wrap(~cellTypes, scales = "free") +
  theme_few() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "500 randomly selected genes", y = "cpm") +
  guides(colour = FALSE)

#log
concatDat %>%
  ggplot() +
  geom_point(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1
  ) +
  geom_point(
    data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
    aes(reorder_within(gene, log2(synCounts + 1), cellTypes, fun = "mean"), log2(synCounts + 1), colour = sample.x), 
    size = 0.3
  ) +
  facet_wrap(~cellTypes, scales = "free") +
  theme_few() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "500 randomly selected genes", y = "log2(cpm)") +
  guides(colour = FALSE)

#violin
concatDat <- synth %>% 
  filter(gene %in% sample(concatDat$gene, 50, replace = FALSE)) %>%
  gather(typeRep, synCounts, -gene) %>%
  separate(typeRep, c("cellTypes", "sample"), sep = "\\.") %>%
  inner_join(., mul, by = c("gene", "cellTypes"))

concatDat %>%
  filter(cellTypes == "HCT116-HOS") %>%
  ggplot() +
  geom_violin(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1
  ) +
  geom_point(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1
  ) +
  geom_point(
    data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
    aes(reorder_within(gene, log2(synCounts + 1), cellTypes, fun = "mean"), log2(synCounts + 1), colour = sample.x), 
    size = 0.3
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = 3)) +
  labs(x = "50 randomly selected genes", y = "log2(cpm)", caption = "HCT116-HOS") +
  guides(colour = FALSE)

concatDat %>%
  filter(cellTypes == "A375-HCT116") %>%
  ggplot() +
  geom_violin(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1
  ) +
  geom_point(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1
  ) +
  geom_point(
    data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
    aes(reorder_within(gene, log2(synCounts + 1), cellTypes, fun = "mean"), log2(synCounts + 1), colour = sample.x), 
    size = 0.3
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = 3)) +
  labs(x = "50 randomly selected genes", y = "log2(cpm)", caption = "A375-HCT116") +
  guides(colour = FALSE)

concatDat %>%
  filter(cellTypes == "A375-HOS") %>%
  ggplot() +
  geom_violin(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1
  ) +
  geom_point(
    data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
    aes(reorder_within(gene, log2(counts + 1), cellTypes, fun = "mean"), log2(counts + 1)), size = 0.1
  ) +
  geom_point(
    data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
    aes(reorder_within(gene, log2(synCounts + 1), cellTypes, fun = "mean"), log2(synCounts + 1), colour = sample.x), 
    size = 0.3
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = 3)) +
  labs(x = "50 randomly selected genes", y = "log2(cpm)", caption = "A375-HOS") +
  guides(colour = FALSE)

Run tsne with real and synthesized multiplets.

#tsne
set.seed(23435)
countsSorted2[, colnames(countsSorted2) %in% filter(countsSortedMeta2, cellTypes %in% concatDat$cellTypes)$sample] %>%
  norm() %>% 
  matrix_to_tibble("gene") %>%
  full_join(synth, by = "gene") %>%
  as.data.frame() %>%
  column_to_rownames("gene") %>%
  as.matrix() %>%
  `+` (1) %>%
  log2(.) %>%
  pearsonsCor(., select = nTopMax(., 2000)) %>%
  Rtsne::Rtsne(., is_distance = TRUE, perplexity = 15) %>%
  `[[` ('Y') %>% 
  matrix_to_tibble() %>%
  add_column(sample = c(colnames(synth[, -1]), colnames(countsSorted2)[colnames(countsSorted2) %in% filter(countsSortedMeta2, cellTypes %in% concatDat$cellTypes)$sample])) %>%
  mutate(type = if_else(str_detect(sample, "^m"), "real", "synthetic")) %>%
  ggplot() +
  geom_point(aes(V1, V2, colour = type))


Adjusting multiplet contribution

Synthesize 25 multiplets for each combination and reduce the contribution of HOS cells by 0%, 25%, 50%, 75%, or 99%. Calculate the mean gene expression for each and plot together with the real multiplets and corresponding singlets.

###FUNCTIONS
#function to calculate the mean gene expression for a specific cell type. Used 
#previous to heatmap plotting
typeMeans <- function(type) {
  countsSortedMeta2 %>%
  filter(cellTypes == type) %>% 
  pull(sample) %>%
  {countsSorted2[, colnames(countsSorted2) %in% .]} %>%
  norm() %>%
  rowMeans() %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  setNames(c("gene", type)) %>%
  as_tibble()
}

#wrapper to generate and reformat multiplets
generateAdjustedMultiplets <- function(sng, classes, combos, adjustment, name) {
  q.name <- enquo(name)
  sng %>%
    norm() %>%
    {map(1:n, function(x) {
      syntheticMultipletsFromCounts(., classes, combos, adjustment, seed = x + 9789)
    })} %>%
    reduce(full_join, by = "gene") %>%
    setNames(c("gene", names)) %>%
    norm() %>%
    gather(cellTypes, !!q.name, -gene) %>%
    mutate(cellTypes = str_replace(cellTypes, "(.*)\\..*", "\\1")) %>%
    group_by(gene, cellTypes) %>%
    summarize(!! quo_name(q.name) := mean(!!q.name)) %>%
    ungroup()
}
###

#Synthesize multiplets with varying levels of adjusted HOS input
adjustment <- rep(1, length(unique(classes)))
names(adjustment) <- unique(classes)
n <- 25
names <- paste(
  apply(combos, 2, paste, collapse = "-"), 
  rep(1:n, each = ncol(combos)), 
  sep = "."
)

synth <- generateAdjustedMultiplets(sng, classes, combos, adjustment, synCounts)
adjustment[names(adjustment) == "HOS"] <- 0.75
synth7 <- generateAdjustedMultiplets(sng, classes, combos, adjustment, synth7)
adjustment[names(adjustment) == "HOS"] <- 0.5
synth5 <- generateAdjustedMultiplets(sng, classes, combos, adjustment, synth5)
adjustment[names(adjustment) == "HOS"] <- 0.25
synth25 <- generateAdjustedMultiplets(sng, classes, combos, adjustment, synth25)
adjustment[names(adjustment) == "HOS"] <- 0.01
synth01 <- generateAdjustedMultiplets(sng, classes, combos, adjustment, synth01)

#heatmap
#find overexpressed genes in each cell type
genes <- foldChangePerClass(cpm(sng), rename(countsSortedMeta2, class = cellTypes)) %>%
  matrix_to_tibble("gene") %>%
  select(gene, A375, HCT116, HOS) %>%
  gather(cellType, fold, -gene) %>%
  group_by(cellType) %>%
  top_n(25) %>%
  ungroup()

#calculate the mean gene expression for each type of singlet
a375 <- typeMeans("A375") %>% mutate(cellType = "A375") %>% rename(counts = A375)
hct116 <- typeMeans("HCT116") %>% mutate(cellType = "HCT116") %>% rename(counts = HCT116)
hos <- typeMeans("HOS") %>% mutate(cellType = "HOS") %>% rename(counts = HOS)
realSng <- bind_rows(a375, hct116, hos)

#calculate the mean gene expression for each type of multiplet
a375_hct116 <- typeMeans("A375-HCT116") %>% 
  add_column(cellTypes = "A375-HCT116") %>%
  rename(counts = `A375-HCT116`)
a375_hos <- typeMeans("A375-HOS") %>% 
  add_column(cellTypes = "A375-HOS") %>%
  rename(counts = `A375-HOS`)
hct116_hos <- typeMeans("HCT116-HOS") %>% 
  add_column(cellTypes = "HCT116-HOS") %>%
  rename(counts = `HCT116-HOS`)

#concatenate real singlet, real multiplet, and info for genes corresponding to 
#a cell type
real <- bind_rows(a375_hct116, a375_hos, hct116_hos) %>%
  rename(multipletCounts = counts) %>% 
  mutate(
    cellType1 = str_replace(cellTypes, "(.*)-.*", "\\1"), 
    cellType2 = str_replace(cellTypes, ".*-(.*)", "\\1")
  ) %>% 
  inner_join(realSng, by = c("cellType1" = "cellType", "gene" = "gene")) %>% 
  inner_join(realSng, by = c("cellType2" = "cellType", "gene" = "gene")) %>%
  inner_join(genes, by = "gene") %>%
  filter(cellType == cellType1 | cellType == cellType2) %>%
  rename(geneType = cellType)

#join everything and plot
real %>%
  inner_join(synth, by = c("gene", "cellTypes")) %>%
  inner_join(synth7, by = c("gene", "cellTypes")) %>%
  inner_join(synth5, by = c("gene", "cellTypes")) %>%
  inner_join(synth25, by = c("gene", "cellTypes")) %>%
  inner_join(synth01, by = c("gene", "cellTypes")) %>%
  gather(type, cpm, -gene, -(geneType:fold), -(cellTypes:cellType2)) %>%
  mutate(type = case_when(
    type == "counts.x" ~ cellType1, 
    type == "counts.y" ~ cellType2, 
    TRUE ~ type
  )) %>%
  mutate(type = case_when(
    type == "multipletCounts" ~ "Multiplet",
    type == "synCounts" ~ "0%",
    type == "synth7" ~ "25%",
    type == "synth5" ~ "50%",
    type == "synth25" ~ "75%",
    type == "synth01" ~ "99%",
    TRUE ~ type
  )) %>%
  mutate(type = parse_factor(
    type, 
    levels = c("Multiplet", "0%", "25%", "50%", "75%", "99%", "A375", "HCT116", "HOS")
  )) %>%
  arrange(geneType, fold) %>%
  mutate(plotGene = paste0(gene, " (", geneType, ")")) %>%
  mutate(plotGene = parse_factor(plotGene, levels = unique(plotGene))) %>%
  mutate(cellTypes = parse_factor(cellTypes, levels = c("A375-HOS", "HCT116-HOS", "A375-HCT116"))) %>%
  ggplot() +
  geom_tile(aes(type, plotGene, fill = log2(cpm + 1))) +
  facet_wrap(~cellTypes, scale = "free") +
  viridis::scale_fill_viridis() +
  theme_few() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.title.x = element_blank(),
    legend.position = "top"
  ) +
  labs(y = "Gene (Cell type)")


Synthesize multiplets with mouse data

Synthesize colon and small intestine stem multiplets with the mouse data.

###FUNCTIONS
plotTSNEgene <- function(geneToPlot, slog, cObjMul) {
  s <- slog %>%
    filter(gene == geneToPlot) %>% 
    select(-gene) %>% 
    rename(gene = counts)
  
  getData(cObjMul, "counts.log")[geneToPlot, ] %>%
    as.data.frame() %>%
    rownames_to_column() %>%
    setNames(c("sample", "gene")) %>%
    as_tibble() %>%
    bind_rows(s) %>%
    full_join(tresults, by = "sample") %>%
    mutate(type = if_else(str_detect(sample, "^m"), "real", "synthetic")) %>%
    ggplot() +
    geom_point(aes(V1, V2, colour = gene, shape = type)) +
    viridis::scale_color_viridis() +
    guides(colour = guide_colorbar(title = geneToPlot))
}
###

s <- str_detect(colnames(countsMgfp), "^s")
commonGenes <- intersect(rownames(countsMgfp), rownames(countsRegev))

sng <- cbind(countsMgfp[commonGenes, s], countsRegev[commonGenes, ])
mul <- countsMgfp[commonGenes, !s]

erccSng <- cbind(
  countsMgfpERCC[, s], 
  matrix(NA, nrow = nrow(countsMgfpERCC), ncol = ncol(countsRegev))
)
erccMul <- cbind(countsMgfpERCC[, !s])

boolMulC <- colnames(mul) %in% filter(countsMgfpMeta, tissue == "colon")$sample
boolSngC <- colnames(sng) %in% filter(countsMgfpMeta, tissue == "colon")$sample

#setup spCounts
cObjSng <- spCounts(sng, erccSng)
## is.na(c(counts, counts.ercc) returned TRUE
cObjMul <- spCounts(mul, erccMul)

#spUnsupervised
uObj <- spUnsupervised(cObjSng, seed = 2334, max_iter = 1000, max = 2000)
plotUnsupervisedClass(uObj, cObjSng) %>%
  plotData() %>%
  ggplot() +
  geom_point(
    aes(
      `t-SNE dim 1`, `t-SNE dim 2`, 
      colour = Classification, size = Uncertainty
    )
  ) + 
  geom_label(
    data = getData(uObj, "tsneMeans"),
    aes(x = x, y = y, label = classification)
  )

classes <- getData(uObj, "classification")

bool <- classes %in% c("B1", "K1", "R1", "T1", "F2")
c <- case_when(
  classes %in% c("B1", "K1") ~ "Colon.Stem",
  classes %in% c("R1", "T1", "F2") ~ "SI.Stem",
  TRUE ~ classes
)
combos <- apply(combn(unique(c[bool]), 2), 2, sort)
adjustment <- rep(1, length(unique(c[bool])))
names(adjustment) <- unique(c[bool])

n <- 30
names <- paste(
  apply(combos, 2, paste, collapse = "-"), 
  rep(1:n, each = ncol(combos)), 
  sep = "."
)

synth <- getData(cObjSng, "counts.cpm")[, bool] %>%
  norm() %>%
  {map(1:n, function(x) {
    syntheticMultipletsFromCounts(., c[bool], combos, adjustment, seed = x + 9789)
  })} %>%
  reduce(full_join, by = "gene") %>%
  setNames(c("gene", names)) %>%
  norm()

#tsne
set.seed(2343)
tresults <- getData(cObjMul, "counts") %>%
  norm() %>%
  matrix_to_tibble("gene") %>%
  full_join(synth, by = "gene") %>%
  as.data.frame() %>%
  column_to_rownames("gene") %>%
  as.matrix() %>%
  `+` (1) %>%
  log2(.) %>%
  pearsonsCor(., select = nTopMax(., 1000)) %>%
  Rtsne::Rtsne(., is_distance = TRUE, max_iter = 3000) %>%
  `[[` ('Y') %>% 
  matrix_to_tibble() %>%
  select(-rowname) %>%
  add_column(sample = c(colnames(getData(cObjMul, "counts")), colnames(synth[, -1])))

#plot results
tresults %>%
  mutate(type = if_else(str_detect(sample, "^m"), "real", "synthetic")) %>%
  ggplot() +
  geom_point(aes(V1, V2, colour = type))

slog <- synth %>% 
  as.data.frame() %>%
  column_to_rownames("gene") %>%
  as.matrix() %>%
  `+` (1) %>%
  log2() %>%
  matrix_to_tibble("gene") %>%
  gather(sample, counts, -gene)

plotTSNEgene("Lgr5", slog, cObjMul)

plotTSNEgene("Lyz1", slog, cObjMul)

plotTSNEgene("Alpi", slog, cObjMul)

plotTSNEgene("Muc2", slog, cObjMul)

plotTSNEgene("Slc40a1", slog, cObjMul)

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2                forcats_0.3.0                
##  [3] stringr_1.3.0                 dplyr_0.7.4                  
##  [5] purrr_0.2.4                   readr_1.1.1                  
##  [7] tidyr_0.8.0                   tibble_1.4.2                 
##  [9] ggplot2_2.2.1.9000            tidyverse_1.2.1              
## [11] ggthemes_3.4.2                printr_0.1                   
## [13] seqTools_0.0.0.9000           sp.scRNAseqTesting_0.0.0.9000
## [15] sp.scRNAseqData_0.0.1.0       sp.scRNAseq_0.0.1.90         
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137           matrixStats_0.53.1     doMC_1.3.5            
##  [4] lubridate_1.7.4        progress_1.1.2.9002    httr_1.3.1            
##  [7] rprojroot_1.3-2        tools_3.4.4            backports_1.1.2       
## [10] R6_2.2.2               lazyeval_0.2.1         BiocGenerics_0.24.0   
## [13] colorspace_1.3-2       withr_2.1.2            tidyselect_0.2.4      
## [16] prettyunits_1.0.2      gridExtra_2.3          mnormt_1.5-5          
## [19] curl_3.2               compiler_3.4.4         cli_1.0.0.9001        
## [22] rvest_0.3.2            xml2_1.2.0             labeling_0.3          
## [25] scales_0.5.0.9000      psych_1.8.3.3          digest_0.6.15         
## [28] foreign_0.8-69         rmarkdown_1.9          pkgconfig_2.0.1       
## [31] htmltools_0.3.6        rlang_0.2.0.9001       readxl_1.0.0          
## [34] rstudioapi_0.7         bindr_0.1.1            jsonlite_1.5          
## [37] mclust_5.3             magrittr_1.5           ansistrings_1.0.0.9000
## [40] Rcpp_0.12.16           munsell_0.4.3          S4Vectors_0.16.0      
## [43] viridis_0.5.1          stringi_1.1.7          yaml_2.1.18           
## [46] ggraph_1.0.1           MASS_7.3-49            Rtsne_0.13            
## [49] plyr_1.8.4             grid_3.4.4             parallel_3.4.4        
## [52] ggrepel_0.7.0          crayon_1.3.4           udunits2_0.13         
## [55] lattice_0.20-35        haven_1.1.1            hms_0.4.2             
## [58] knitr_1.20             pillar_1.2.1           igraph_1.2.1          
## [61] pso_1.0.3              reshape2_1.4.3         codetools_0.2-15      
## [64] stats4_3.4.4           glue_1.2.0             evaluate_0.10.1       
## [67] modelr_0.1.1           selectr_0.4-1          tweenr_0.1.5          
## [70] foreach_1.4.4          cellranger_1.1.0       gtable_0.2.0          
## [73] rematch2_2.0.1         assertthat_0.2.0       rgeolocate_1.0.1      
## [76] ggforce_0.1.1          openxlsx_4.0.17        broom_0.4.4           
## [79] tidygraph_1.1.0        e1071_1.6-8            class_7.3-14          
## [82] viridisLite_0.3.0      iterators_1.0.9        units_0.5-1